In [80]:
import numpy as np
import matplotlib.pylab as py
import pandas as pa
import scipy.stats as st
np.set_printoptions(precision=2)
%matplotlib inline
In this section we show a few examples of discrete random variables using Python.
The documentation for these routines can be found at https://docs.scipy.org/doc/scipy/reference/stats.html.
In [81]:
X=st.bernoulli(p=0.3)
X.rvs(100)
Out[81]:
In [82]:
# Note that "high" is not included.
X=st.randint(low=1,high=5)
X.rvs(100)
Out[82]:
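Beyond drawing samples with rvs, these frozen distribution objects also expose the probability mass function, the CDF, and the theoretical moments. A minimal sketch using the same randint variable (not part of the original notebook):
X = st.randint(low=1, high=5)
print(X.pmf(3))    # P(X = 3); each of 1,2,3,4 has probability 0.25
print(X.cdf(2))    # P(X <= 2) = 0.5
print(X.mean())    # theoretical mean, 2.5
print(X.var())     # theoretical variance, 1.25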
Next we look at some continuous random variables; the documentation for these routines can be found at the same page, https://docs.scipy.org/doc/scipy/reference/stats.html.
In [86]:
XUniform=st.uniform(loc=0.7,scale=0.3);
# "bins" tells you how many bars to use
# "normed" says to turn the counts into probability densities
py.hist(XUniform.rvs(1000000),bins=20,normed=True);
x = np.linspace(-0.1,1.1,100)
py.plot(x,XUniform.pdf(x))
#py.savefig('Figures/uniformPDF.png')
Out[86]:
In [87]:
py.plot(x,XUniform.cdf(x))
py.savefig('Figures/uniformCDF.png')
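Since the CDF gives P(X <= x), differences of CDF values give interval probabilities. A quick sanity check for this uniform variable, which is supported on [0.7, 1.0]:
# P(0.8 < X <= 0.9) should be 0.1/0.3, i.e. one third of the support
print(XUniform.cdf(0.9) - XUniform.cdf(0.8))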
In [93]:
XNormal=st.norm(loc=0,scale=1);
# "bins" tells you how many bars to use
# "normed" says to turn the counts into probability densities
py.hist(XNormal.rvs(1000),bins=100,normed=True);
x = np.linspace(-3,3,100)
py.plot(x,XNormal.pdf(x))
#py.savefig('Figures/normalPDF.png')
Out[93]:
In [94]:
py.plot(x,XNormal.cdf(x))
py.savefig('Figures/normalCDF.png')
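The CDF also gives a quick check of the 68-95-99.7 rule for the standard normal:
# probability mass within k standard deviations of the mean
for k in [1, 2, 3]:
    print(XNormal.cdf(k) - XNormal.cdf(-k))   # ~0.68, ~0.95, ~0.997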
Now we can look at the histograms of some of our data from Case Study 2.
In [95]:
data = pa.read_hdf('data.h5','movies')
In [96]:
data
Out[96]:
In [97]:
data['title'][100000]
Out[97]:
In [98]:
X=data.pivot_table('rating',index='timestamp',aggfunc='count')
In [99]:
X.plot()
Out[99]:
In [100]:
# Warning: Some versions of Pandas use "index" and "columns", some use "rows" and "cols"
X=data.pivot_table('rating',index='title',aggfunc='sum')
#X=data.pivot_table('rating',rows='title',aggfunc='sum')
In [101]:
X
Out[101]:
In [102]:
X.hist()
Out[102]:
In [103]:
# Warning: Some versions of Pandas use "index" and "columns", some use "rows" and "cols"
X=data.pivot_table('rating',index='occupation',aggfunc='sum')
#X=data.pivot_table('rating',rows='occupation',aggfunc='sum')
In [104]:
X
Out[104]:
Here we show an example of the central limit theorem. You can play around with "numberOfDistributions" and "numberOfSamples" to see how quickly this converges to something that looks Gaussian.
In [113]:
numberOfDistributions = 100
numberOfSamples = 1000
XTest = st.uniform(loc=0,scale=1);
# The same thing works with many distributions.
#XTest = st.lognorm(s=1.0);
XCLT=np.zeros([numberOfSamples])
for i in range(numberOfSamples):
    for j in range(numberOfDistributions):
        XCLT[i] += XTest.rvs()
    XCLT[i] = XCLT[i]/numberOfDistributions
In [114]:
py.hist(XCLT,normed=True)
Out[114]:
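As a sanity check, we can overlay the Gaussian that the central limit theorem predicts: an average of numberOfDistributions independent uniform(0,1) draws has mean 1/2 and variance (1/12)/numberOfDistributions. A sketch, reusing the hist call from above:
py.hist(XCLT, normed=True)
xs = np.linspace(0.4, 0.6, 100)
# CLT prediction: mean 0.5, standard deviation sqrt(1/12/numberOfDistributions)
XGauss = st.norm(loc=0.5, scale=np.sqrt(1.0/12/numberOfDistributions))
py.plot(xs, XGauss.pdf(xs))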
Next we look at some basic ideas in linear algebra and how you can use them in Python.
In [115]:
import numpy as np
In [116]:
a=np.array([1,2,3])
a
Out[116]:
In [117]:
A=np.matrix(np.random.randint(1,10,size=[3,3]))
A
Out[117]:
In [118]:
x=np.matrix([[1],[2],[3]])
print x
print x.T
In [119]:
a*a
Out[119]:
In [120]:
np.dot(a,a)
Out[120]:
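Note the difference between the two cells above: for 1-d arrays, * multiplies elementwise, while np.dot computes the inner product, i.e. the sum of the elementwise products:
print(np.sum(a*a))   # 1 + 4 + 9 = 14, the same value np.dot(a,a) returns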
In [123]:
x.T*x
Out[123]:
In [124]:
A*x
Out[124]:
In [125]:
b = np.matrix([[5],[6],[7]])
b
Out[125]:
In [126]:
Ai = np.linalg.inv(A)
print A
print Ai
In [127]:
A*Ai
Out[127]:
In [129]:
Ai*A
Out[129]:
In [130]:
xHat = Ai*b
xHat
Out[130]:
In [131]:
print A*xHat
print b
In [133]:
sizes = range(100,1000,200)
times = np.zeros(len(sizes))
for i in range(len(sizes)):
    A = np.random.random(size=[sizes[i],sizes[i]])
    x = %timeit -o np.linalg.inv(A)
    times[i] = x.best
py.plot(sizes,times)
Out[133]:
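Inverting an n-by-n matrix takes O(n^3) operations, so these times should grow roughly cubically. One rough way to eyeball this (a quick sketch, not a careful benchmark) is to overlay a cubic scaled to the first measurement:
ns = np.array(sizes)
py.plot(ns, times)
py.plot(ns, times[0]*(ns/float(ns[0]))**3)   # cubic through the first point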
Sparse matrices (those with lots of 0s) can often be worked with much more efficiently than general matrices by using specialized methods.
In [134]:
from scipy.sparse.linalg import spsolve
from scipy.sparse import rand,eye
mySize = 1000
A=rand(mySize,mySize,0.001)+eye(mySize)
b=np.random.random(size=[mySize])
The sparsity structure of A.
In [135]:
py.spy(A,markersize=0.1)
Out[135]:
In [136]:
dense = %timeit -o np.linalg.solve(A.todense(),b)
In [137]:
sparse = %timeit -o spsolve(A,b)
In [138]:
dense.best/sparse.best
Out[138]:
Pandas provides many routines for computing statistics.
In [46]:
XNormal=st.norm(loc=0.7,scale=2);
x = XNormal.rvs(1000000)
print np.mean(x)
print np.std(x)
print np.var(x)
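The same summaries are available as pandas methods if we wrap the samples in a Series; one thing to keep in mind is that pandas' std uses the unbiased n-1 normalization while np.std defaults to n:
s = pa.Series(x)
print(s.mean())
print(s.std())       # ddof=1 by default, unlike np.std
print(s.describe())  # count, mean, std, min, quartiles, max in one call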
But empirical measures are not always good approximations of the true properties of the distribution.
In [47]:
sizes = 2**np.arange(16)
errors = np.zeros(16)
for i in range(16):
    x = XNormal.rvs(sizes[i])
    errors[i] = np.abs(0.7-np.mean(x))
# The standard error of the mean of n samples is sigma/sqrt(n), with sigma=2 here.
py.plot(sizes,errors)
py.plot(sizes,2/np.sqrt(sizes))
py.plot(sizes,2*2/np.sqrt(sizes),'r')
py.savefig('Figures/errorInMean.png')
In [48]:
data.pivot_table?
In [49]:
X=data.pivot_table('rating',index='title',aggfunc='mean')
#X=data.pivot_table('rating',rows='title',aggfunc='mean')
In [50]:
X.hist()
In [51]:
X=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
#X=data.pivot_table('rating',rows='title',cols='gender',aggfunc='mean')
In [52]:
py.subplot(1,2,1)
X['M'].hist()
py.subplot(1,2,2)
X['F'].hist()
Out[52]:
In [53]:
py.plot(X['M'],X['F'],'.')
Out[53]:
In [54]:
X.cov()
Out[54]:
In [55]:
X.corr()
Out[55]:
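Correlation is just covariance rescaled by the standard deviations: corr(M,F) = cov(M,F) / (std(M) std(F)). We can confirm the off-diagonal entry directly, restricting to titles rated by both genders so all three quantities use the same rows:
Xc = X.dropna()   # keep only titles with ratings from both genders
print(Xc['M'].cov(Xc['F']) / (Xc['M'].std() * Xc['F'].std()))
print(Xc.corr())  # the off-diagonal entries should match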
In [56]:
X=data.pivot_table('rating',index='occupation',columns='gender',aggfunc='mean')
#X=data.pivot_table('rating',rows='occupation',cols='gender',aggfunc='mean')
In [57]:
X
Out[57]:
In [ ]: